# Step 01 of the GOTV replication: session setup and raw-data load.
# Start from an empty global environment.
rm(list = ls())
#-------------------------------------------------------------------------------------
# NOTE(review): a negative `warn` makes R ignore all warnings for the rest of
# the session, so any coercion or imputation warnings raised below are hidden.
options(warn=-1)
# All relative paths used below are resolved against this directory.
setwd('~/Dropbox/Research/compliance_blocking/replication/GOTV')
library(logr)
# Open the log file for this step.
lf<-log_open("01_load_data.log")
#-------------------------------------------------------------------------------------
# Character columns are read as factors; later steps call as.numeric() on some
# of these columns.
data = read.csv('Data/raw/GreenGerberNickerson_JP_2003-1_EDITED.csv', stringsAsFactors=TRUE)
#-------------------------------------------------------------------------------------
library(tidyverse)
library(DMwR)
# Record R and package versions in the log.
log_print(sessionInfo())
#-------------------------------------------------------------------------------------
# Convert into indicators.
# Each comparison yields 1 when the value differs from the reference label and
# 0 when it equals it; a missing comparison stays NA.
data$voted01 <- as.numeric(data$voted01 != "Abstained")
data$voted00 <- as.numeric(data$voted00 != "Abstained")

data$T <- as.numeric(data$treatmen != "Control")
data$C <- as.numeric(data$contact != "No")

# voted99: take the numeric value of the column and shift it down by 2, then
# treat negative results as missing and flag them.
# NOTE(review): presumably the column is a factor whose first level marks
# "missing", so the codes become -1 / 0 / 1 -- confirm against the raw file.
data$voted99 <- as.numeric(data$voted99) - 2
data$voted99[which(data$voted99 < 0)] <- NA
data$missing99 <- as.numeric(is.na(data$voted99))

# primary: same shift, missing-value rule and flag as voted99.
data$primary <- as.numeric(data$primary) - 2
data$primary[which(data$primary < 0)] <- NA
data$missing_primary <- as.numeric(is.na(data$primary))

# Interpolate: keep the analysis variables and fill their missing values.
df <- dplyr::select(
  data,
  age, turf, voted99, voted00, missing99, voted01, famsize, C, T,
  primary, missing_primary
)

# NOTE for replication: as of 7/18/23, the DMwR package is not available on
# CRAN. It can be downloaded from the CRAN archives. The cleaned data file is
# also provided in the Dataverse.
set.seed(1)
df <- DMwR::knnImputation(df)

# Put the city label in front of the imputed columns and rename the outcome
# (voted01) to Y.
df <- data.frame(city = paste(data$city), df) %>%
  rename(Y = voted01)

# Map the upper-case city labels to display names; any other value is kept
# unchanged. The result is stored as a factor.
city_labels <- c(
  "MINNEAPOLIS" = "Minneapolis",
  "DETROIT" = "Detroit",
  "COLUMBUS" = "Columbus",
  "ST PAUL" = "St. Paul"
)
city_raw <- paste(data$city)
needs_label <- city_raw %in% names(city_labels)
city_raw[needs_label] <- unname(city_labels[city_raw[needs_label]])
df$city <- as.factor(city_raw)

# Create age buckets: 0/1 indicators for <35, [35, 50), [50, 65) and >=65.
# The intervals are half-open so that every age falls in exactly one bucket.
# The earlier bounds (<= 34 / >= 35, <= 49 / >= 50, <= 64 / >= 65) left gaps
# for non-integer ages, which can occur because age is passed through
# knnImputation above. Results are unchanged for integer ages.
df$age18 <- as.numeric(df$age < 35)
df$age35 <- as.numeric(df$age >= 35 & df$age < 50)
df$age50 <- as.numeric(df$age >= 50 & df$age < 65)
df$age65 <- as.numeric(df$age >= 65)

# Columbus rows are forced into the 35-49 bucket and out of the 65+ bucket.
# NOTE(review): age18 and age50 are not reset for Columbus, so a Columbus row
# can carry two bucket flags -- confirm this is intended.
df$age65[df$city == "Columbus"] <- 0
df$age35[df$city == "Columbus"] <- 1

# Append race and party from the raw data, then add squared age.
df <- cbind(df, race = data$race, party = data$party)
df$age2 <- df$age^2

# Turn primary and voted99 into 0/1 values with one Bernoulli draw per row,
# using the current value as the success probability. Values that are exactly
# 0 or 1 are reproduced as-is; fractional values (presumably produced by the
# imputation step) are resolved at random under a fixed seed.
set.seed(1)
df$primary <- rbinom(n = nrow(df), size = 1, prob = df$primary)
df$voted99 <- rbinom(n = nrow(df), size = 1, prob = df$voted99)

# Write the cleaned data set and close the log for this step.
save(df, file = "Data/generated/gotv_cleaned.Rdata")

log_close()